SEER Data Analysis

Phase 3: Data Exploration


In [53]:
%matplotlib inline

import os
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from MasterSeer import MasterSeer
from sklearn.feature_selection import SelectPercentile, f_classif, SelectFromModel
from sklearn.linear_model import LinearRegression
from lifelines.plotting import plot_lifetimes
from lifelines import KaplanMeierFitter
from numpy.random import uniform, exponential
from pandas.tools.plotting import scatter_matrix, radviz, parallel_coordinates

To begin exploring the data we took a sample of the SEER data, defined the features and dependent variable, printed the top few lines to ensure a successful data ingest, and ran descriptive statistics


In [57]:
FEATURES  = [
    "Birth Year",
    "Age at Diagnosis",
    "Race",
    "Origin",
    "laterality",
    "Radiation",
    "Histrec",
    "ER Status",
    "PR Status",
    "Behanal",
    "Stage",
    "Numprimes",
    "Survival Time",
    "Bucket"
]

LABEL_MAP = {
    0: "< 60 Months",
    1: "60 < months > 120",
    2: "> 120 months",
}

# Read the data into a DataFrame
df = pd.read_csv("clean1.csv", sep=',' , header=0, names=FEATURES)


# Convert class labels into text
for k,v in LABEL_MAP.items():
    df.ix[df.Bucket == k, 'Bucket'] = v

print(df.head(n=5))

df.describe()


    Birth Year  Age at Diagnosis  Race  Origin  laterality  Radiation  \
0         1919                82     1       0           1          0   
5         1959                40     1       0           1          0   
8         1931                72     1       0           1          1   
10        1921                84     1       0           1          0   
11        1933                74     1       0           1          1   

    Histrec  ER Status  PR Status  Behanal  Stage  Numprimes  Survival Time  \
0         9          2          0        3      4          1             23   
5         9          2          2        3      1          1            154   
8         9          2          2        3      1          1            104   
10        9          2          2        3      1          1             89   
11        9          2          2        2      0          1             69   

               Bucket  
0         < 60 Months  
5        > 120 months  
8   60 < months > 120  
10  60 < months > 120  
11  60 < months > 120  
Out[57]:
Birth Year Age at Diagnosis Race Origin laterality Radiation Histrec ER Status PR Status Behanal Stage Numprimes Survival Time
count 10497.000000 10497.000000 10497.000000 10497.000000 10497.000000 10497.000000 10497.000000 10497.000000 10497.000000 10497.000000 10497.000000 10497.000000 10497.000000
mean 1942.575212 57.031342 2.612651 0.264838 1.002001 0.593122 8.743832 1.519101 1.338287 2.928360 1.398304 1.160713 120.349624
std 13.475209 12.930236 10.292738 1.253416 0.044685 0.631462 1.212398 0.851179 0.936114 0.257902 0.840551 0.367283 65.418822
min 1894.000000 20.000000 1.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 2.000000 0.000000 1.000000 0.000000
25% 1933.000000 47.000000 1.000000 0.000000 1.000000 0.000000 9.000000 2.000000 0.000000 3.000000 1.000000 1.000000 72.000000
50% 1943.000000 56.000000 1.000000 0.000000 1.000000 1.000000 9.000000 2.000000 2.000000 3.000000 1.000000 1.000000 111.000000
75% 1952.000000 66.000000 1.000000 0.000000 1.000000 1.000000 9.000000 2.000000 2.000000 3.000000 2.000000 1.000000 166.000000
max 1985.000000 100.000000 99.000000 9.000000 2.000000 5.000000 22.000000 2.000000 2.000000 3.000000 4.000000 2.000000 275.000000

Next we checked our data type and determine the frequency of each class


In [58]:
print (df.groupby('Bucket')['Bucket'].count())


Bucket
60 < months > 120    4116
< 60 Months          1606
> 120 months         4775
Name: Bucket, dtype: int64

We used a histogram to see the distribution of survival time in months


In [59]:
fig = plt.figure()
ax = fig.add_subplot(111)
ax.hist(df['Survival Time'], bins = 10, range = (df['Survival Time'].min(),df['Survival Time'].max()))
plt.title('Survival Time Distribution')
plt.xlabel('Survival Time')
plt.ylabel('Months')
plt.show()


Next we played around with a few visualization to get a better understanding of the data


In [60]:
scatter_matrix(df, alpha=0.2, figsize=(12, 12), diagonal='kde')
plt.show()



In [61]:
plt.figure(figsize=(12,12))
parallel_coordinates(df, 'Bucket')
plt.show()


The plot below shows a lot of overlap between the 3 classes which alludes to the fact that classification models may not perform great. However, the plot also shows a more clear classification along the birth year and age at diagnosis features.


In [62]:
plt.figure(figsize=(12,12))
radviz(df, 'Bucket')
plt.show()


Next we moved to creating survival charts using a larger sample size. We created a class with a "plot_survival" function. For the graph we picked variables that the scientific literature finds significant-- Stage, ER status, PR status, age, and radiation treatment. The second plot compares the frequency of survival for censored and non-censored patients.


In [63]:
class ExploreSeer(MasterSeer):

    def __init__(self, path=r'./data/', testMode=False, verbose=True, sample_size=5000):

        # user supplied parameters
        self.testMode = testMode        # import one file, 500 records and return
        self.verbose = verbose          # prints status messages
        self.sample_size = sample_size  # number of rows to pull for testing

        if type(path) != str:
            raise TypeError('path must be a string')

        if path[-1] != '/':
            path += '/'            # if path does not end with a backslash, add one

        self.path = path

        # open connection to the database
        super().__init__(path, False, verbose=verbose)
        self.db_conn, self.db_cur = super().init_database(False)


    def __del__(self):
        super().__del__()

   
    def plot_survival(self):

        df = super().load_data(col  = ['YR_BRTH','AGE_DX','LATERAL','RADIATN','HISTREC','ERSTATUS',
                                       'PRSTATUS','BEHANAL','HST_STGA','NUMPRIMS', 'SRV_TIME_MON', 
                                       'SRV_TIME_MON_PA', 'DTH_CLASS', 'O_DTH_CLASS', 'STAT_REC'], 
                cond = 'SRV_TIME_MON < 1000 AND HST_STGA < 8 AND DTH_CLASS < 9 AND ERSTATUS < 4 AND PRSTATUS < 4', 
                               sample_size = 100000)

        kmf = KaplanMeierFitter()

        try:
            df.RADIATN = df.RADIATN.replace(7, 0)
            df = df[df.RADIATN < 7] 
        except Exception as err:
            pass

        # 0-negative, 1-borderline,, 2-positive
        df = df[df.ERSTATUS != 4]
        df = df[df.ERSTATUS != 9]
        df.ERSTATUS = df.ERSTATUS.replace(2, 0)
        df.ERSTATUS = df.ERSTATUS.replace(1, 2)
        df.ERSTATUS = df.ERSTATUS.replace(3, 1)

        # 0-negative, 1-borderline,, 2-positive
        df = df[df.PRSTATUS != 4]
        df = df[df.PRSTATUS != 9]
        df.PRSTATUS = df.PRSTATUS.replace(2, 0)
        df.PRSTATUS = df.PRSTATUS.replace(1, 2)
        df.PRSTATUS = df.PRSTATUS.replace(3, 1)

        rad = df.RADIATN > 0
        er  = df.ERSTATUS > 0
        pr  = df.PRSTATUS > 0

        st0  = df.HST_STGA == 0
        st1  = df.HST_STGA == 1
        st2  = df.HST_STGA == 2
        st4  = df.HST_STGA == 4

        age = df.AGE_DX < 50


        df['SRV_TIME_YR'] = df['SRV_TIME_MON'] / 12
        T = df['SRV_TIME_YR']
        #C = (np.logical_or(df.DTH_CLASS == 1, df.O_DTH_CLASS == 1))
        C = df.STAT_REC == 4

         
        f, ax = plt.subplots(5, sharex=True, sharey=True)
        ax[0].set_title("Lifespans of cancer patients");

        # radiation
        kmf.fit(T[rad], event_observed=C[rad], label="Radiation")
        kmf.plot(ax=ax[0]) #, ci_force_lines=True)
        kmf.fit(T[~rad], event_observed=C[~rad], label="No Radiation")
        kmf.plot(ax=ax[0]) #, ci_force_lines=True)

        # ER Status
        kmf.fit(T[er], event_observed=C[er], label="ER Positive")
        kmf.plot(ax=ax[1]) #, ci_force_lines=True)
        kmf.fit(T[~er], event_observed=C[~er], label="ER Negative")
        kmf.plot(ax=ax[1]) #, ci_force_lines=True)

        # PR Status
        kmf.fit(T[pr], event_observed=C[pr], label="PR Positive")
        kmf.plot(ax=ax[2]) #, ci_force_lines=True)
        kmf.fit(T[~pr], event_observed=C[~pr], label="PR Negative")
        kmf.plot(ax=ax[2]) #, ci_force_lines=True)

        # stage
        kmf.fit(T[st0], event_observed=C[st0], label="Stage 0")
        kmf.plot(ax=ax[3]) #, ci_force_lines=True)
        kmf.fit(T[st1], event_observed=C[st1], label="Stage 1")
        kmf.plot(ax=ax[3]) #, ci_force_lines=True)
        kmf.fit(T[st2], event_observed=C[st2], label="Stage 2")
        kmf.plot(ax=ax[3]) #, ci_force_lines=True)
        kmf.fit(T[st4], event_observed=C[st4], label="Stage 4")
        kmf.plot(ax=ax[3]) #, ci_force_lines=True)

        # age
        kmf.fit(T[age], event_observed=C[age], label="Age < 50")
        kmf.plot(ax=ax[4]) #, ci_force_lines=True)
        kmf.fit(T[~age], event_observed=C[~age], label="Age >= 50")
        kmf.plot(ax=ax[4]) #, ci_force_lines=True)

        ax[0].legend(loc=3,prop={'size':10})
        ax[1].legend(loc=3,prop={'size':10})
        ax[2].legend(loc=3,prop={'size':10})
        ax[3].legend(loc=3,prop={'size':10})
        ax[4].legend(loc=3,prop={'size':10})

        ax[len(ax)-1].set_xlabel('Survival in years')

        f.text(0.04, 0.5, 'Survival %', va='center', rotation='vertical')
        plt.tight_layout()

        plt.ylim(0,1);
        plt.show()

        f, ax = plt.subplots(2, sharex=True, sharey=True)

        df.hist('SRV_TIME_YR', by=df.STAT_REC != 4, ax=(ax[0], ax[1]))
        ax[0].set_title('Histogram of Non Censored Patients')
        ax[0].set_ylabel('Number of Patients')

        ax[1].set_ylabel('Number of Patients')
        ax[1].set_title('Histogram of Censored Patients')
        ax[1].set_xlabel('Survival in Years')
        plt.show()

        return

        # second plot of survival

        fig, ax = plt.subplots(figsize=(8, 6))

        cen = df[df.STAT_REC != 4].SRV_TIME_MON
        nc = df[df.STAT_REC == 4].SRV_TIME_MON
        cen = cen.sort_values()
        nc = nc.sort_values()

        ax.hlines([x for x in range(len(nc))] , 0, nc , color = 'b', label='Uncensored');
        ax.hlines([x for x in range(len(nc), len(nc)+len(cen))], 0, cen, color = 'r', label='Censored');

        ax.set_xlim(left=0);
        ax.set_xlabel('Months');
        ax.set_ylim(-0.25, len(df) + 0.25);
        ax.legend(loc='best');
        plt.show()

        return

In [64]:
seer = ExploreSeer(sample_size=10000)
seer.plot_survival()


Database initialized

In [ ]: